library(rstudioapi)
library(stringr)
library(pivottabler)
library(data.table)
library(plotly)
library(pheatmap)
library(viridis)
library(knitr)
app_dir <- str_split(rstudioapi::getActiveDocumentContext()$path, 'icb_data_exploration_and_analysis')[[1]][1]
app_dir <- paste0(app_dir, 'icb_data_exploration_and_analysis')
setwd(app_dir)

dataset_names <- list.dirs(file.path(app_dir, '.local_data'), full.names = FALSE, recursive = FALSE)

study_colnames <- list()
for(study in dataset_names){
  study_df <- read.csv(file.path(app_dir, '.local_data', study, paste0(study, '_metadata.tsv')), sep='\t')
  study_colnames[[study]] <- colnames(study_df)
}

common_colnames <- Reduce(intersect, study_colnames)
common_colnames <- c(common_colnames, c('Treatment.regimen', 'Drug.targets', 'Metastasis.status', 'Monotherapy', 'Combination'))
# common_colnames <- common_colnames[!common_colnames %in% c("TMB_raw", "nsTMB_raw", "indel_TMB_raw", "indel_nsTMB_raw", "TMB_perMb", "nsTMB_perMb", "indel_TMB_perMb", "indel_nsTMB_perMb", "CIN", "CNA_tot", "AMP", "DEL")]
# df_merged <- data.frame(matrix(NA, 0, length(c('study', common_colnames)), dimnames=list(c(), c('study', common_colnames))), stringsAsFactors=F)
df_merged <- read.csv(file.path(app_dir, 'Data', 'all_metadata.tsv'), sep='\t')
total_rows <- 0

df_metadata <- data.frame(matrix(NA, length(common_colnames), length(dataset_names), dimnames=list(common_colnames, dataset_names)), stringsAsFactors=F)
df_num_genes <- data.frame(matrix(NA, length(dataset_names), 2, dimnames=list(dataset_names, c('num_patients', 'num_genes'))), stringsAsFactors=F)

all.features <- list()
for(study in dataset_names){
  study_df <- df_merged[df_merged$study == study, ]
  study_df <- cbind(study = study, study_df)
  
  expr_df <- NA
  if(file.exists(file.path(app_dir, '.local_data', study, paste0(study, '_expr.tsv')))){
    expr_df <- read.csv(file.path(app_dir, '.local_data', study, paste0(study, '_expr.tsv')), sep='\t')
  }else{
    expr_df <- read.csv(file.path(app_dir, '.local_data', study, paste0(study, '_expr_gene_tpm.tsv')), sep='\t') 
  }
  all.features[[study]] <- rownames(expr_df)
  
  df_metadata[, study] <- unlist(lapply(common_colnames, function(col_name){
    sum(!is.na(study_df[, col_name]))
  }))

  df_num_genes[study, 'num_patients'] <- ncol(expr_df)
  df_num_genes[ study, 'num_genes'] = nrow(expr_df)
}

Number of available genes in each dataset

df_num_genes <- df_num_genes[order(df_num_genes$num_genes, decreasing = TRUE), ]
kable(df_num_genes, caption='')
num_patients num_genes
ICB_Gide 41 61544
ICB_Hugo 27 61544
ICB_Jung 27 61544
ICB_Kim 45 61544
ICB_Riaz 46 61544
ICB_Miao1 33 57820
ICB_Van_Allen 42 57820
ICB_Nathanson 24 52230
ICB_Braun 181 40994
ICB_Snyder 25 34906
ICB_VanDenEnde 35 31031
ICB_Mariathasan 348 23005
ICB_Padron 45 21366
ICB_Liu 121 20212
ICB_Puch 55 18789
ICB_Shiuan 15 17235
ICB_Roh 16 721
ICB_Jerby_Arnon 112 587
ICB_Hwang 21 384

Patient metadata availablility

matrix_metadata <- as.matrix(df_metadata)
excluded_metadata <- c('patientid', 'treatmentid', 'dna', 'rna')
included_metadata <- rownames(matrix_metadata)[!rownames(matrix_metadata) %in% excluded_metadata]
excluded_datasets <- c()
matrix_metadata <- matrix_metadata[included_metadata, !colnames(matrix_metadata) %in% excluded_datasets]

heatmap(matrix_metadata, Colv = NA, Rowv = NA, scale="column")

pt <- PivotTable$new()
pt$addData(df_merged)
pt$addColumnDataGroups("response")
pt$addRowDataGroups("study", addTotal=FALSE)
pt$addRowDataGroups("sex", addTotal=FALSE)
pt$addRowDataGroups("cancer_type", addTotal=FALSE)
pt$defineCalculation(calculationName="TotalPatients", summariseExpression="n()")
pt$renderPivot()

Common genes across datasets

excluded_datasets <- c('ICB_Hwang', 'ICB_Jerby_Arnon', 'ICB_Roh')
dataset_names_common_genes <- dataset_names[!dataset_names %in% excluded_datasets]
temp.features <- all.features[names(all.features)[!names(all.features) %in% excluded_datasets]]

lapply(temp.features,head) # everything looks good!
## $ICB_Braun
## [1] "ENSG00000000003.10" "ENSG00000000005.5"  "ENSG00000000419.8" 
## [4] "ENSG00000000457.9"  "ENSG00000000460.12" "ENSG00000000938.8" 
## 
## $ICB_Gide
## [1] "ENSG00000000003.15" "ENSG00000000005.6"  "ENSG00000000419.14"
## [4] "ENSG00000000457.14" "ENSG00000000460.17" "ENSG00000000938.13"
## 
## $ICB_Hugo
## [1] "ENSG00000000003.15" "ENSG00000000005.6"  "ENSG00000000419.14"
## [4] "ENSG00000000457.14" "ENSG00000000460.17" "ENSG00000000938.13"
## 
## $ICB_Jung
## [1] "ENSG00000000003.15" "ENSG00000000005.6"  "ENSG00000000419.14"
## [4] "ENSG00000000457.14" "ENSG00000000460.17" "ENSG00000000938.13"
## 
## $ICB_Kim
## [1] "ENSG00000000003.15" "ENSG00000000005.6"  "ENSG00000000419.14"
## [4] "ENSG00000000457.14" "ENSG00000000460.17" "ENSG00000000938.13"
## 
## $ICB_Liu
## [1] "ENSG00000000003.15" "ENSG00000000005.6"  "ENSG00000000419.14"
## [4] "ENSG00000000457.14" "ENSG00000000460.17" "ENSG00000000938.13"
## 
## $ICB_Mariathasan
## [1] "ENSG00000000003.15" "ENSG00000000005.6"  "ENSG00000000419.14"
## [4] "ENSG00000000457.14" "ENSG00000000460.17" "ENSG00000000938.13"
## 
## $ICB_Miao1
## [1] "ENSG00000000003.10" "ENSG00000000005.5"  "ENSG00000000419.8" 
## [4] "ENSG00000000457.9"  "ENSG00000000460.12" "ENSG00000000938.8" 
## 
## $ICB_Nathanson
## [1] "ENSG00000000003.15" "ENSG00000000005.6"  "ENSG00000000419.14"
## [4] "ENSG00000000457.14" "ENSG00000000460.17" "ENSG00000000938.13"
## 
## $ICB_Padron
## [1] "ENSG00000000003.15" "ENSG00000000005.6"  "ENSG00000000419.14"
## [4] "ENSG00000000457.14" "ENSG00000000460.17" "ENSG00000000938.13"
## 
## $ICB_Puch
## [1] "ENSG00000000003.10" "ENSG00000000005.5"  "ENSG00000000419.8" 
## [4] "ENSG00000000457.9"  "ENSG00000000460.12" "ENSG00000000938.8" 
## 
## $ICB_Riaz
## [1] "ENSG00000000003.15" "ENSG00000000005.6"  "ENSG00000000419.14"
## [4] "ENSG00000000457.14" "ENSG00000000460.17" "ENSG00000000938.13"
## 
## $ICB_Shiuan
## [1] "ENSG00000000003.15" "ENSG00000000005.6"  "ENSG00000000419.14"
## [4] "ENSG00000000457.14" "ENSG00000000460.17" "ENSG00000000938.13"
## 
## $ICB_Snyder
## [1] "ENSG00000000003.10" "ENSG00000000005.5"  "ENSG00000000419.8" 
## [4] "ENSG00000000457.9"  "ENSG00000000460.12" "ENSG00000000938.8" 
## 
## $ICB_Van_Allen
## [1] "ENSG00000000003.10" "ENSG00000000005.5"  "ENSG00000000419.8" 
## [4] "ENSG00000000457.9"  "ENSG00000000460.12" "ENSG00000000938.8" 
## 
## $ICB_VanDenEnde
## [1] "ENSG00000003056.3" "ENSG00000005073.5" "ENSG00000006116.3"
## [4] "ENSG00000008197.4" "ENSG00000019995.6" "ENSG00000039139.9"
temp <- unlist(temp.features)
temp <- unique(temp) # this is the 93819 features Jonas reported

num_datasets <- length(dataset_names_common_genes)
i=1
for (i in 2:num_datasets) {
  temp <- base::intersect(temp,temp.features[[i]])
}

temp.table <- matrix(nrow=num_datasets, ncol=num_datasets)
for (i in 1:num_datasets) {
  for (j in 1:num_datasets) {
    temp.table[i,j] <- length(base::intersect(temp.features[[i]],temp.features[[j]]))
  }
}

colnames(temp.table) <- names(temp.features)
rownames(temp.table) <- names(temp.features)

range(temp.table) # 160 61544
## [1]   160 61544
pheatmap(log10(temp.table), color = viridis(30))

datasets_to_remove <- c('ICB_Puch', 'ICB_Shiuan', 'ICB_Liu', 'ICB_Padron', 'ICB_Mariathasan')
# datasets_to_remove <- c()

available_datasets <- df_merged[!df_merged$study %in% excluded_datasets, ]
all_patients <- length(rownames(available_datasets))
all_genes <- range(temp.table)[2]

available_datasets <- available_datasets[!available_datasets$study %in% datasets_to_remove, ]
available_patients <- length(rownames(available_datasets))

overlapping_genes <- temp.table[!rownames(temp.table) %in% datasets_to_remove, !colnames(temp.table) %in% datasets_to_remove]
available_genes <- range(overlapping_genes)[1]

# paste0('total genes ', all_genes)
# paste0('total patients ', all_patients)
# paste0('available patients ', available_patients)
# paste0('available genes ', available_genes)
colors <- c('#FF7F0E', '#c6c6c6')
available_patients_pct <- (available_patients/all_patients) * 100
patients <- data.frame(
  availability=c('available', 'unavailable'), 
  percentage=c(available_patients_pct, 100 - available_patients_pct)
)
pie_patients <- plot_ly(
  patients, 
  labels = ~availability, 
  values = ~percentage, 
  marker = list(colors = colors),
  type = 'pie'
)
pie_patients <- pie_patients %>% layout(title = 'Available Patients',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))

available_genes_pct <- (available_genes/all_genes) * 100
genes <- data.frame(
  availability=c('available', 'unavailable'), 
  percentage=c(available_genes_pct, 100 - available_genes_pct)
)
pie_genes <- plot_ly(
  genes, 
  labels = ~availability, 
  values = ~percentage, 
  marker = list(colors = colors),
  type = 'pie'
)
pie_genes <- pie_genes %>% layout(title = 'Available Genes',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))

pie_patients
pie_genes